df <- read.csv("/Users/dshukla/MS/Regression/project/car_dekho.csv")
library(stringr)
library(ggplot2)
library(gridExtra)
library(grid)
library(car)
## Loading required package: carData
head(df); str(df)
## 'data.frame':    8128 obs. of  13 variables:
##  $ name         : chr  "Maruti Swift Dzire VDI" "Skoda Rapid 1.5 TDI Ambition" "Honda City 2017-2020 EXi" "Hyundai i20 Sportz Diesel" ...
##  $ year         : int  2014 2014 2006 2010 2007 2017 2007 2001 2011 2013 ...
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
##  $ fuel         : chr  "Diesel" "Diesel" "Petrol" "Diesel" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ mileage      : chr  "23.4 kmpl" "21.14 kmpl" "17.7 kmpl" "23.0 kmpl" ...
##  $ engine       : chr  "1248 CC" "1498 CC" "1497 CC" "1396 CC" ...
##  $ max_power    : chr  "74 bhp" "103.52 bhp" "78 bhp" "90 bhp" ...
##  $ torque       : chr  "190Nm@ 2000rpm" "250Nm@ 1500-2500rpm" "12.7@ 2,700(kgm@ rpm)" "22.4 kgm at 1750-2750rpm" ...
##  $ seats        : int  5 5 5 5 5 5 5 4 5 5 ...

Data Preproccessing

library(tidyr)

df$engine_cc <- as.numeric(str_split_fixed(df$engine, " ", 2)[,1])
df$max_power_bhp <- as.numeric(str_split_fixed(df$max_power, " ", 2)[,1])
df$mileage_kmpl <- as.numeric(str_split_fixed(df$mileage, " ", 2)[,1])
df$brand <- str_split_fixed(df$name, " ", 2)[,1]
df$age=  2021- as.numeric(df$year)

EDA

variables <- c("age","brand","selling_price","km_driven","fuel","seller_type","transmission","owner","seats","engine_cc","max_power_bhp","mileage_kmpl")
df_clean <- df[,variables]

str(df_clean)
## 'data.frame':    8128 obs. of  12 variables:
##  $ age          : num  7 7 15 11 14 4 14 20 10 8 ...
##  $ brand        : chr  "Maruti" "Skoda" "Honda" "Hyundai" ...
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
##  $ fuel         : chr  "Diesel" "Diesel" "Petrol" "Diesel" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ seats        : int  5 5 5 5 5 5 5 4 5 5 ...
##  $ engine_cc    : num  1248 1498 1497 1396 1298 ...
##  $ max_power_bhp: num  74 103.5 78 90 88.2 ...
##  $ mileage_kmpl : num  23.4 21.1 17.7 23 16.1 ...

Check for NA’s

num_cols <- df_clean[,sapply(df_clean,is.numeric)]
summary(num_cols)
##       age         selling_price        km_driven           seats       
##  Min.   : 1.000   Min.   :   29999   Min.   :      1   Min.   : 2.000  
##  1st Qu.: 4.000   1st Qu.:  254999   1st Qu.:  35000   1st Qu.: 5.000  
##  Median : 6.000   Median :  450000   Median :  60000   Median : 5.000  
##  Mean   : 7.196   Mean   :  638272   Mean   :  69820   Mean   : 5.417  
##  3rd Qu.:10.000   3rd Qu.:  675000   3rd Qu.:  98000   3rd Qu.: 5.000  
##  Max.   :38.000   Max.   :10000000   Max.   :2360457   Max.   :14.000  
##                                                        NA's   :221     
##    engine_cc    max_power_bhp     mileage_kmpl  
##  Min.   : 624   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.:1197   1st Qu.: 68.05   1st Qu.:16.78  
##  Median :1248   Median : 82.00   Median :19.30  
##  Mean   :1459   Mean   : 91.52   Mean   :19.42  
##  3rd Qu.:1582   3rd Qu.:102.00   3rd Qu.:22.32  
##  Max.   :3604   Max.   :400.00   Max.   :42.00  
##  NA's   :221    NA's   :216      NA's   :221
na.val  <- lapply(df_clean,function(x) mean(is.na(x)) * 100)
na.df = data.frame( variable = names(na.val), 
            percentage.na =round(as.numeric( sapply(na.val, "[", 1) ),2))
print(na.df)
##         variable percentage.na
## 1            age          0.00
## 2          brand          0.00
## 3  selling_price          0.00
## 4      km_driven          0.00
## 5           fuel          0.00
## 6    seller_type          0.00
## 7   transmission          0.00
## 8          owner          0.00
## 9          seats          2.72
## 10     engine_cc          2.72
## 11 max_power_bhp          2.66
## 12  mileage_kmpl          2.72
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
imp_brand <- df_clean %>% group_by(brand)%>% count() %>%arrange(desc(n)) %>% filter(n > 50)
df_model_clean <- df_clean[df_clean$brand %in% unique(imp_brand$brand),]
dim(df_model_clean)
## [1] 7928   12
df_model <- df_model_clean[!is.na(df_model_clean$seats) & !is.na(df_model_clean$max_power_bhp),]
dim(df_model)
## [1] 7713   12
check <- df_model_clean[is.na(df_model_clean$seats) & is.na(df_model_clean$max_power_bhp),]
head(check)
str(df_model_clean)
## 'data.frame':    7928 obs. of  12 variables:
##  $ age          : num  7 7 15 11 14 4 14 20 10 8 ...
##  $ brand        : chr  "Maruti" "Skoda" "Honda" "Hyundai" ...
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
##  $ fuel         : chr  "Diesel" "Diesel" "Petrol" "Diesel" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ seats        : int  5 5 5 5 5 5 5 4 5 5 ...
##  $ engine_cc    : num  1248 1498 1497 1396 1298 ...
##  $ max_power_bhp: num  74 103.5 78 90 88.2 ...
##  $ mileage_kmpl : num  23.4 21.1 17.7 23 16.1 ...

Categorical columns distribution

cat_cols <- df_model[,sapply(df_model,is.character)]
lapply(cat_cols, function(x) table(x))
## $brand
## x
##           BMW     Chevrolet        Datsun          Ford         Honda 
##           118           230            65           388           466 
##       Hyundai        Jaguar      Mahindra        Maruti Mercedes-Benz 
##          1360            71           758          2367            54 
##        Nissan       Renault         Skoda          Tata        Toyota 
##            81           228           104           719           452 
##    Volkswagen         Volvo 
##           185            67 
## 
## $fuel
## x
##    CNG Diesel    LPG Petrol 
##     52   4171     35   3455 
## 
## $seller_type
## x
##           Dealer       Individual Trustmark Dealer 
##             1031             6446              236 
## 
## $transmission
## x
## Automatic    Manual 
##       941      6772 
## 
## $owner
## x
##          First Owner Fourth & Above Owner         Second Owner 
##                 5069                  159                 1984 
##       Test Drive Car          Third Owner 
##                    2                  499
par(mfrow = c(1,2))
hist(df_model$selling_price, main= paste("selling_price"), breaks = 50,probability = TRUE)

hist(log(df_model$selling_price), main= paste("log of selling_price"), breaks = 50,probability = TRUE)

# library(hrbrthemes)
# 
# for  (feature in colnames(cat_cols)) {
#   
# p <-  (ggplot(data=df_model, aes(x= log(selling_price), group= df_model[,feature], fill=df_model[,feature])) +
#     geom_density() +
#     theme_ipsum()) 
# 
# print(p + scale_fill_discrete(name = feature))
# }
p1 <- ggplot(df_model, aes(x=brand, y=log(selling_price), fill=brand)) + 
  geom_boxplot()+labs(title=paste0("Selling Price  by brand"),x= 'brand', y = "Selling Price") + scale_fill_discrete(name = df_model$brand) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p2 <- ggplot(df_model, aes(x=fuel, y=log(selling_price), fill=fuel)) + 
  geom_boxplot()+labs(title=paste0("Selling Price  by fuel type"),x= 'fuel', y = "Selling Price") + scale_fill_discrete(name = df_model$fuel) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p3 <- ggplot(df_model, aes(x=owner, y=log(selling_price), fill=owner)) + 
  geom_boxplot()+labs(title=paste0("Selling Price  by owner"),x= 'owner', y = "Selling Price") + scale_fill_discrete(name = df_model$owner) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p4 <- ggplot(df_model, aes(x=seller_type, y=log(selling_price), fill=seller_type)) + 
  geom_boxplot()+labs(title=paste0("Selling Price  by seller_type"),x= 'seller_type', y = "Selling Price") + scale_fill_discrete(name = df_model$seller_type) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p5 <- ggplot(df_model, aes(x=transmission, y=log(selling_price), fill=transmission)) + 
  geom_boxplot()+labs(title=paste0("Selling Price  by transmission"),x= "transmission", y = "Selling Price")  + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

grid.arrange(p2,p3,p4,p5,ncol = 2)

library(dplyr)
df_model%>%
group_by(brand)%>%
count()%>%
arrange(desc(n))%>%
filter(n >90)%>%
ggplot()+geom_col(aes(x=n,y=reorder(brand,n),fill=brand),show.legend = FALSE)+
geom_label(aes(y = reorder(brand,n), x = n, label = paste(round((n/sum(n))*100,2),'%')))+
labs(title = 'Percentage share of Brands',
     subtitle = '',
    x= 'Percentage Share',
    y='Company')

num_cols['log_sp'] <- log(num_cols$selling_price)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pairs.panels(num_cols[,-1], 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = T,  # show density plots
             ellipses = TRUE # show correlation ellipses
             )

library(ggpubr)
p1 <- ggplot(df_model[df_model$km_driven < 5e5,], aes(x = km_driven, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")

p2 <- ggplot(df_model, aes(x = age, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")

p3 <- ggplot(df_model, aes(x = seats, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")

p4 <- ggplot(df_model, aes(x = engine_cc, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")

p5 <- ggplot(df_model, aes(x = max_power_bhp, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")

p6 <- ggplot(df_model, aes(x = mileage_kmpl, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")

grid.arrange(p1,p2,p3,p4,p5,p6,ncol = 2)

# grid.arrange(p3,p4,ncol = 2)
# 
# grid.arrange(p5,p6,ncol = 2)
df_model[df_model$km_driven >= 1400000,]

Train and Test Split

## 75% of the sample size
smp_size <- floor(0.90 * nrow(df_model))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df_model)), size = smp_size)

train <- df_model[train_ind, ]
test <- df_model[-train_ind, ]

dim(train)
## [1] 6941   12
dim(test)
## [1] 772  12

Base Model (M0)

library(nortest)
options(scipen = 999)
model0 <- lm(selling_price ~ .,data = train)
summary(model0)
## 
## Call:
## lm(formula = selling_price ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3331487  -100659   -10525    84701  5618801 
## 
## Coefficients:
##                                   Estimate     Std. Error t value
## (Intercept)                  3061483.60573    86798.48811  35.271
## age                           -42063.61774     1397.05789 -30.109
## brandChevrolet              -2737092.62121    42481.78140 -64.430
## brandDatsun                 -2771706.50901    55744.85491 -49.721
## brandFord                   -2671164.01425    38924.12799 -68.625
## brandHonda                  -2662256.49132    37799.99259 -70.430
## brandHyundai                -2665701.48926    36798.52180 -72.440
## brandJaguar                 -1212964.80295    47790.23552 -25.381
## brandMahindra               -2650663.29388    39631.00892 -66.884
## brandMaruti                 -2579102.22228    37717.30825 -68.380
## brandMercedes-Benz          -1518746.65643    52397.13327 -28.985
## brandNissan                 -2690308.73525    49467.45167 -54.385
## brandRenault                -2686283.05223    42479.20797 -63.238
## brandSkoda                  -2698990.66527    45596.99991 -59.192
## brandTata                   -2772964.46671    37964.59638 -73.041
## brandToyota                 -2339451.06306    39565.94516 -59.128
## brandVolkswagen             -2747436.32080    41752.55927 -65.803
## brandVolvo                   -853489.66020    48455.86584 -17.614
## km_driven                         -0.92773        0.09071 -10.228
## fuelDiesel                    138921.82456    46927.21497   2.960
## fuelLPG                       142308.83427    71073.60365   2.002
## fuelPetrol                     23954.84944    47134.03953   0.508
## seller_typeIndividual         -53617.61075    12145.30067  -4.415
## seller_typeTrustmark Dealer   -59969.67963    25052.77826  -2.394
## transmissionManual            -91904.88281    15050.03862  -6.107
## ownerFourth & Above Owner     -10240.87886    26713.29725  -0.383
## ownerSecond Owner             -57342.34573     9361.62193  -6.125
## ownerTest Drive Car           636677.82502   214130.61187   2.973
## ownerThird Owner              -33109.30678    16056.32241  -2.062
## seats                            451.40330     6201.60825   0.073
## engine_cc                         22.71074       19.44683   1.168
## max_power_bhp                   6289.80492      213.68493  29.435
## mileage_kmpl                   -3892.63349     1657.66269  -2.348
##                                         Pr(>|t|)    
## (Intercept)                 < 0.0000000000000002 ***
## age                         < 0.0000000000000002 ***
## brandChevrolet              < 0.0000000000000002 ***
## brandDatsun                 < 0.0000000000000002 ***
## brandFord                   < 0.0000000000000002 ***
## brandHonda                  < 0.0000000000000002 ***
## brandHyundai                < 0.0000000000000002 ***
## brandJaguar                 < 0.0000000000000002 ***
## brandMahindra               < 0.0000000000000002 ***
## brandMaruti                 < 0.0000000000000002 ***
## brandMercedes-Benz          < 0.0000000000000002 ***
## brandNissan                 < 0.0000000000000002 ***
## brandRenault                < 0.0000000000000002 ***
## brandSkoda                  < 0.0000000000000002 ***
## brandTata                   < 0.0000000000000002 ***
## brandToyota                 < 0.0000000000000002 ***
## brandVolkswagen             < 0.0000000000000002 ***
## brandVolvo                  < 0.0000000000000002 ***
## km_driven                   < 0.0000000000000002 ***
## fuelDiesel                               0.00308 ** 
## fuelLPG                                  0.04529 *  
## fuelPetrol                               0.61131    
## seller_typeIndividual             0.000010269919 ***
## seller_typeTrustmark Dealer              0.01670 *  
## transmissionManual                0.000000001073 ***
## ownerFourth & Above Owner                0.70146    
## ownerSecond Owner                 0.000000000955 ***
## ownerTest Drive Car                      0.00296 ** 
## ownerThird Owner                         0.03924 *  
## seats                                    0.94198    
## engine_cc                                0.24291    
## max_power_bhp               < 0.0000000000000002 ***
## mileage_kmpl                             0.01889 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 301800 on 6908 degrees of freedom
## Multiple R-squared:  0.8325, Adjusted R-squared:  0.8317 
## F-statistic:  1073 on 32 and 6908 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model0, which = 1:4)

ncvTest(model0)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 35038.85, Df = 1, p = < 0.000000000000000222
ad.test(resid(model0))
## 
##  Anderson-Darling normality test
## 
## data:  resid(model0)
## A = 473.93, p-value < 0.00000000000000022
vif(model0)
##                   GVIF Df GVIF^(1/(2*Df))
## age           2.210890  1        1.486906
## brand         9.569932 16        1.073133
## km_driven     1.620535  1        1.273002
## fuel          2.806896  3        1.187694
## seller_type   1.540598  2        1.114095
## transmission  1.841868  1        1.357154
## owner         1.429861  4        1.045711
## seats         2.687588  1        1.639386
## engine_cc     6.969510  1        2.639983
## max_power_bhp 3.915497  1        1.978761
## mileage_kmpl  3.392585  1        1.841897

Transformation

Predictor

pt <- powerTransform(cbind(df_model$age,df_model$km_driven,df_model$engine_cc,df_model$max_power_bhp,(df_model$mileage_kmpl + 1)) ~ 1)
summary(pt)
## bcPower Transformations to Multinormality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1    0.2483        0.25       0.2182       0.2784
## Y2    0.3107        0.31       0.2940       0.3273
## Y3   -0.7397       -0.74      -0.7894      -0.6900
## Y4   -0.4298       -0.43      -0.4708      -0.3887
## Y5    1.1027        1.10       1.0471       1.1582
## 
## Likelihood ratio test that transformation parameters are equal to 0
##  (all log transformations)
##                                    LRT df                   pval
## LR test, lambda = (0 0 0 0 0) 6559.984  5 < 0.000000000000000222
## 
## Likelihood ratio test that no transformations are needed
##                                    LRT df                   pval
## LR test, lambda = (1 1 1 1 1) 15589.43  5 < 0.000000000000000222

Response

model_tra <- lm(selling_price ~ I(train$age ^0.25) + I(train$km_driven ^0.31)+  I(train$engine_cc^-0.74) + I(train$max_power_bhp^-.43) + I(train$mileage_kmpl^1.10) + train$brand + train$fuel + train$owner + train$seller_type, data = train)
boxCox(model_tra, lambda=seq(-2, 2, by=0.5))

ad.test(resid(model_tra))
## 
##  Anderson-Darling normality test
## 
## data:  resid(model_tra)
## A = 486.09, p-value < 0.00000000000000022
ncvTest(model_tra)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 27863.8, Df = 1, p = < 0.000000000000000222

Transformed Model (model 1)

model_transf <- lm(log(selling_price) ~ I(train$age ^0.25) + I(train$km_driven ^0.31)+  I(train$engine_cc^-0.74) + I(train$max_power_bhp^-.43) + I(train$mileage_kmpl^1.10) + train$brand + train$fuel + train$owner + train$seller_type, data = train)

summary(model_transf)
## 
## Call:
## lm(formula = log(selling_price) ~ I(train$age^0.25) + I(train$km_driven^0.31) + 
##     I(train$engine_cc^-0.74) + I(train$max_power_bhp^-0.43) + 
##     I(train$mileage_kmpl^1.1) + train$brand + train$fuel + train$owner + 
##     train$seller_type, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.68999 -0.15032  0.02182  0.16612  1.27555 
## 
## Coefficients:
##                                      Estimate  Std. Error t value
## (Intercept)                        18.9737871   0.0599518 316.484
## I(train$age^0.25)                  -1.7939047   0.0223920 -80.114
## I(train$km_driven^0.31)            -0.0026144   0.0006590  -3.967
## I(train$engine_cc^-0.74)          -84.2468820   8.2372023 -10.228
## I(train$max_power_bhp^-0.43)      -12.6600487   0.3125336 -40.508
## I(train$mileage_kmpl^1.1)          -0.0001428   0.0009052  -0.158
## train$brandChevrolet               -1.1408342   0.0340716 -33.483
## train$brandDatsun                  -1.1415710   0.0461392 -24.742
## train$brandFord                    -0.9776257   0.0312521 -31.282
## train$brandHonda                   -0.8309816   0.0308041 -26.976
## train$brandHyundai                 -0.8654984   0.0292005 -29.640
## train$brandJaguar                  -0.0698798   0.0418248  -1.671
## train$brandMahindra                -0.8795154   0.0305598 -28.780
## train$brandMaruti                  -0.7930701   0.0298014 -26.612
## train$brandMercedes-Benz           -0.0381083   0.0459314  -0.830
## train$brandNissan                  -0.8912754   0.0413645 -21.547
## train$brandRenault                 -0.8740731   0.0342687 -25.506
## train$brandSkoda                   -0.8956728   0.0384576 -23.290
## train$brandTata                    -1.2590186   0.0301269 -41.791
## train$brandToyota                  -0.5033485   0.0314215 -16.019
## train$brandVolkswagen              -0.9400152   0.0342168 -27.472
## train$brandVolvo                   -0.2294982   0.0423882  -5.414
## train$fuelDiesel                    0.2594932   0.0411919   6.300
## train$fuelLPG                       0.1683162   0.0623226   2.701
## train$fuelPetrol                    0.0220839   0.0414781   0.532
## train$ownerFourth & Above Owner    -0.2608520   0.0232368 -11.226
## train$ownerSecond Owner            -0.0923323   0.0082316 -11.217
## train$ownerTest Drive Car           0.2222491   0.1877271   1.184
## train$ownerThird Owner             -0.1769434   0.0140122 -12.628
## train$seller_typeIndividual        -0.0439739   0.0105924  -4.151
## train$seller_typeTrustmark Dealer  -0.0353604   0.0218303  -1.620
##                                               Pr(>|t|)    
## (Intercept)                       < 0.0000000000000002 ***
## I(train$age^0.25)                 < 0.0000000000000002 ***
## I(train$km_driven^0.31)                 0.000073402337 ***
## I(train$engine_cc^-0.74)          < 0.0000000000000002 ***
## I(train$max_power_bhp^-0.43)      < 0.0000000000000002 ***
## I(train$mileage_kmpl^1.1)                      0.87468    
## train$brandChevrolet              < 0.0000000000000002 ***
## train$brandDatsun                 < 0.0000000000000002 ***
## train$brandFord                   < 0.0000000000000002 ***
## train$brandHonda                  < 0.0000000000000002 ***
## train$brandHyundai                < 0.0000000000000002 ***
## train$brandJaguar                              0.09481 .  
## train$brandMahindra               < 0.0000000000000002 ***
## train$brandMaruti                 < 0.0000000000000002 ***
## train$brandMercedes-Benz                       0.40675    
## train$brandNissan                 < 0.0000000000000002 ***
## train$brandRenault                < 0.0000000000000002 ***
## train$brandSkoda                  < 0.0000000000000002 ***
## train$brandTata                   < 0.0000000000000002 ***
## train$brandToyota                 < 0.0000000000000002 ***
## train$brandVolkswagen             < 0.0000000000000002 ***
## train$brandVolvo                        0.000000063635 ***
## train$fuelDiesel                        0.000000000317 ***
## train$fuelLPG                                  0.00694 ** 
## train$fuelPetrol                               0.59445    
## train$ownerFourth & Above Owner   < 0.0000000000000002 ***
## train$ownerSecond Owner           < 0.0000000000000002 ***
## train$ownerTest Drive Car                      0.23650    
## train$ownerThird Owner            < 0.0000000000000002 ***
## train$seller_typeIndividual             0.000033433733 ***
## train$seller_typeTrustmark Dealer              0.10532    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2645 on 6910 degrees of freedom
## Multiple R-squared:  0.8919, Adjusted R-squared:  0.8914 
## F-statistic:  1900 on 30 and 6910 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model_transf, which = 1:4)

ncvTest(model_transf)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 107.2202, Df = 1, p = < 0.000000000000000222
ad.test(resid(model_transf))
## 
##  Anderson-Darling normality test
## 
## data:  resid(model_transf)
## A = 24.528, p-value < 0.00000000000000022
vif(model_transf)
##                                  GVIF Df GVIF^(1/(2*Df))
## I(train$age^0.25)            2.520715  1        1.587676
## I(train$km_driven^0.31)      2.213487  1        1.487779
## I(train$engine_cc^-0.74)     8.103191  1        2.846610
## I(train$max_power_bhp^-0.43) 4.486294  1        2.118087
## I(train$mileage_kmpl^1.1)    2.856638  1        1.690159
## train$brand                  5.237449 16        1.053107
## train$fuel                   3.153256  3        1.210951
## train$owner                  1.414022  4        1.044256
## train$seller_type            1.487383  2        1.104347

model2

model1 <- lm(log(selling_price) ~ .,data = train)
summary(model1)
## 
## Call:
## lm(formula = log(selling_price) ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3324 -0.1422  0.0100  0.1589  1.7701 
## 
## Coefficients:
##                                   Estimate     Std. Error  t value
## (Intercept)                 12.99812307225  0.06975240296  186.347
## age                         -0.11357743584  0.00112269403 -101.165
## brandChevrolet              -0.88910954524  0.03413891646  -26.044
## brandDatsun                 -0.86895832328  0.04479729621  -19.398
## brandFord                   -0.62458897027  0.03127993954  -19.968
## brandHonda                  -0.47251440806  0.03037656959  -15.555
## brandHyundai                -0.53439724103  0.02957177454  -18.071
## brandJaguar                 -0.09943444766  0.03840485978   -2.589
## brandMahindra               -0.67859782519  0.03184799833  -21.307
## brandMaruti                 -0.48158143322  0.03031012338  -15.888
## brandMercedes-Benz          -0.12091519804  0.04210702321   -2.872
## brandNissan                 -0.56763036587  0.03975269268  -14.279
## brandRenault                -0.60800562600  0.03413684842  -17.811
## brandSkoda                  -0.56922536839  0.03664234689  -15.535
## brandTata                   -0.95627132244  0.03050884736  -31.344
## brandToyota                 -0.31663448806  0.03179571223   -9.958
## brandVolkswagen             -0.59921359521  0.03355290400  -17.859
## brandVolvo                  -0.11653870399  0.03893976904   -2.993
## km_driven                   -0.00000068040  0.00000007289   -9.334
## fuelDiesel                   0.31995684072  0.03771132516    8.484
## fuelLPG                      0.17988260526  0.05711567966    3.149
## fuelPetrol                   0.08489589000  0.03787753209    2.241
## seller_typeIndividual       -0.03623107511  0.00976012284   -3.712
## seller_typeTrustmark Dealer -0.01163701178  0.02013274105   -0.578
## transmissionManual          -0.06100781358  0.01209440834   -5.044
## ownerFourth & Above Owner   -0.14471406887  0.02146715587   -6.741
## ownerSecond Owner           -0.08229654556  0.00752312211  -10.939
## ownerTest Drive Car          0.32067409601  0.17207816691    1.864
## ownerThird Owner            -0.12306518514  0.01290307118   -9.538
## seats                        0.04059985615  0.00498369369    8.147
## engine_cc                    0.00020696456  0.00001562773   13.243
## max_power_bhp                0.00839131801  0.00017172001   48.866
## mileage_kmpl                 0.00050418568  0.00133211947    0.378
##                                         Pr(>|t|)    
## (Intercept)                 < 0.0000000000000002 ***
## age                         < 0.0000000000000002 ***
## brandChevrolet              < 0.0000000000000002 ***
## brandDatsun                 < 0.0000000000000002 ***
## brandFord                   < 0.0000000000000002 ***
## brandHonda                  < 0.0000000000000002 ***
## brandHyundai                < 0.0000000000000002 ***
## brandJaguar                             0.009643 ** 
## brandMahindra               < 0.0000000000000002 ***
## brandMaruti                 < 0.0000000000000002 ***
## brandMercedes-Benz                      0.004096 ** 
## brandNissan                 < 0.0000000000000002 ***
## brandRenault                < 0.0000000000000002 ***
## brandSkoda                  < 0.0000000000000002 ***
## brandTata                   < 0.0000000000000002 ***
## brandToyota                 < 0.0000000000000002 ***
## brandVolkswagen             < 0.0000000000000002 ***
## brandVolvo                              0.002774 ** 
## km_driven                   < 0.0000000000000002 ***
## fuelDiesel                  < 0.0000000000000002 ***
## fuelLPG                                 0.001643 ** 
## fuelPetrol                              0.025037 *  
## seller_typeIndividual                   0.000207 ***
## seller_typeTrustmark Dealer             0.563273    
## transmissionManual          0.000000466795935124 ***
## ownerFourth & Above Owner   0.000000000016978633 ***
## ownerSecond Owner           < 0.0000000000000002 ***
## ownerTest Drive Car                     0.062429 .  
## ownerThird Owner            < 0.0000000000000002 ***
## seats                       0.000000000000000441 ***
## engine_cc                   < 0.0000000000000002 ***
## max_power_bhp               < 0.0000000000000002 ***
## mileage_kmpl                            0.705083    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2425 on 6908 degrees of freedom
## Multiple R-squared:  0.9091, Adjusted R-squared:  0.9087 
## F-statistic:  2160 on 32 and 6908 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model1, which = 1:4)

ncvTest(model1)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 319.8596, Df = 1, p = < 0.000000000000000222
ad.test(resid(model1))
## 
##  Anderson-Darling normality test
## 
## data:  resid(model1)
## A = 15.926, p-value < 0.00000000000000022
vif(model1)
##                   GVIF Df GVIF^(1/(2*Df))
## age           2.210890  1        1.486906
## brand         9.569932 16        1.073133
## km_driven     1.620535  1        1.273002
## fuel          2.806896  3        1.187694
## seller_type   1.540598  2        1.114095
## transmission  1.841868  1        1.357154
## owner         1.429861  4        1.045711
## seats         2.687588  1        1.639386
## engine_cc     6.969510  1        2.639983
## max_power_bhp 3.915497  1        1.978761
## mileage_kmpl  3.392585  1        1.841897

Variable Selection

Backward BIC

mod.0 <- lm(log(selling_price) ~ 1, data = train)
#step(mod.0, scope = list(lower = mod.0, upper = model0), direction = "forward") 
n <- length(train$selling_price)
step(model0, scope = list(lower = mod.0, upper = model1), direction = 'backward',k = log(n), trace = 0)
## 
## Call:
## lm(formula = selling_price ~ age + brand + km_driven + fuel + 
##     seller_type + transmission + owner + max_power_bhp + mileage_kmpl, 
##     data = train)
## 
## Coefficients:
##                 (Intercept)                          age  
##                3102019.2910                  -42192.1610  
##              brandChevrolet                  brandDatsun  
##               -2733398.4212                -2765912.0782  
##                   brandFord                   brandHonda  
##               -2666546.4486                -2657058.4051  
##                brandHyundai                  brandJaguar  
##               -2662992.1031                -1213703.0368  
##               brandMahindra                  brandMaruti  
##               -2638380.5967                -2575923.5655  
##          brandMercedes-Benz                  brandNissan  
##               -1515664.3280                -2685859.6358  
##                brandRenault                   brandSkoda  
##               -2683177.1011                -2694586.7050  
##                   brandTata                  brandToyota  
##               -2768618.2629                -2325529.1950  
##             brandVolkswagen                   brandVolvo  
##               -2745160.0468                 -854009.7271  
##                   km_driven                   fuelDiesel  
##                     -0.9175                  140544.6275  
##                     fuelLPG                   fuelPetrol  
##                 136194.9250                   17761.8281  
##       seller_typeIndividual  seller_typeTrustmark Dealer  
##                 -53513.9336                  -59434.7742  
##          transmissionManual    ownerFourth & Above Owner  
##                 -92733.5629                  -10735.2800  
##           ownerSecond Owner          ownerTest Drive Car  
##                 -57459.7598                  637694.1058  
##            ownerThird Owner                max_power_bhp  
##                 -33411.5063                    6420.6067  
##                mileage_kmpl  
##                  -4873.5412

Forward BIC

n <- length(train$selling_price)
step(model0, scope = list(lower = mod.0, upper = model1), direction = 'forward',k = log(n), trace = 0)
## 
## Call:
## lm(formula = selling_price ~ age + brand + km_driven + fuel + 
##     seller_type + transmission + owner + seats + engine_cc + 
##     max_power_bhp + mileage_kmpl, data = train)
## 
## Coefficients:
##                 (Intercept)                          age  
##                3061483.6057                  -42063.6177  
##              brandChevrolet                  brandDatsun  
##               -2737092.6212                -2771706.5090  
##                   brandFord                   brandHonda  
##               -2671164.0142                -2662256.4913  
##                brandHyundai                  brandJaguar  
##               -2665701.4893                -1212964.8030  
##               brandMahindra                  brandMaruti  
##               -2650663.2939                -2579102.2223  
##          brandMercedes-Benz                  brandNissan  
##               -1518746.6564                -2690308.7353  
##                brandRenault                   brandSkoda  
##               -2686283.0522                -2698990.6653  
##                   brandTata                  brandToyota  
##               -2772964.4667                -2339451.0631  
##             brandVolkswagen                   brandVolvo  
##               -2747436.3208                 -853489.6602  
##                   km_driven                   fuelDiesel  
##                     -0.9277                  138921.8246  
##                     fuelLPG                   fuelPetrol  
##                 142308.8343                   23954.8494  
##       seller_typeIndividual  seller_typeTrustmark Dealer  
##                 -53617.6108                  -59969.6796  
##          transmissionManual    ownerFourth & Above Owner  
##                 -91904.8828                  -10240.8789  
##           ownerSecond Owner          ownerTest Drive Car  
##                 -57342.3457                  636677.8250  
##            ownerThird Owner                        seats  
##                 -33109.3068                     451.4033  
##                   engine_cc                max_power_bhp  
##                     22.7107                    6289.8049  
##                mileage_kmpl  
##                  -3892.6335

Anova to Compare Backward and Forward Model

model_back <- lm(log(selling_price) ~ age + brand + km_driven + fuel + 
    seller_type + transmission + owner + max_power_bhp + engine_cc + seats, 
    data = train)

model_forward <- lm(log(selling_price) ~ age + brand + km_driven + fuel + 
    seller_type + transmission + owner + seats + engine_cc + 
    max_power_bhp + mileage_kmpl,data = train)
anova(model_back,model_forward)

Model after removing insignificant variable

model3

model_sub <- lm(log(selling_price) ~ age + brand + km_driven + fuel + seller_type + 
    transmission + owner + max_power_bhp + engine_cc + seats, data = train)
summary(model_sub)
## 
## Call:
## lm(formula = log(selling_price) ~ age + brand + km_driven + fuel + 
##     seller_type + transmission + owner + max_power_bhp + engine_cc + 
##     seats, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.33270 -0.14294  0.01098  0.15852  1.77240 
## 
## Coefficients:
##                                   Estimate     Std. Error  t value
## (Intercept)                 13.01536516209  0.05281790747  246.420
## age                         -0.11375660148  0.00101795252 -111.750
## brandChevrolet              -0.88882464124  0.03412850026  -26.043
## brandDatsun                 -0.86793271727  0.04471249182  -19.411
## brandFord                   -0.62435269230  0.03127177028  -19.965
## brandHonda                  -0.47151209898  0.03025903746  -15.583
## brandHyundai                -0.53402092293  0.02955322220  -18.070
## brandJaguar                 -0.10016839564  0.03835349275   -2.612
## brandMahindra               -0.67866807750  0.03184548274  -21.311
## brandMaruti                 -0.48052820564  0.03018024011  -15.922
## brandMercedes-Benz          -0.12153438725  0.04207262136   -2.889
## brandNissan                 -0.56745037321  0.03974738333  -14.276
## brandRenault                -0.60723962439  0.03407468822  -17.821
## brandSkoda                  -0.56937580843  0.03663791913  -15.541
## brandTata                   -0.95628032895  0.03050694640  -31.346
## brandToyota                 -0.31637682809  0.03178645250   -9.953
## brandVolkswagen             -0.59937749497  0.03354802914  -17.866
## brandVolvo                  -0.11715221646  0.03890360333   -3.011
## km_driven                   -0.00000068017  0.00000007289   -9.332
## fuelDiesel                   0.32042092384  0.03768904883    8.502
## fuelLPG                      0.17806342174  0.05690954868    3.129
## fuelPetrol                   0.08316591712  0.03759840327    2.212
## seller_typeIndividual       -0.03618362669  0.00975871256   -3.708
## seller_typeTrustmark Dealer -0.01148104238  0.02012727509   -0.570
## transmissionManual          -0.06079112370  0.01208010077   -5.032
## ownerFourth & Above Owner   -0.14489219553  0.02146066560   -6.752
## ownerSecond Owner           -0.08236333284  0.00752058599  -10.952
## ownerTest Drive Car          0.32041910687  0.17206617855    1.862
## ownerThird Owner            -0.12322823567  0.01289507796   -9.556
## max_power_bhp                0.00838467109  0.00017080902   49.088
## engine_cc                    0.00020496403  0.00001470581   13.938
## seats                        0.04007367034  0.00478555924    8.374
##                                         Pr(>|t|)    
## (Intercept)                 < 0.0000000000000002 ***
## age                         < 0.0000000000000002 ***
## brandChevrolet              < 0.0000000000000002 ***
## brandDatsun                 < 0.0000000000000002 ***
## brandFord                   < 0.0000000000000002 ***
## brandHonda                  < 0.0000000000000002 ***
## brandHyundai                < 0.0000000000000002 ***
## brandJaguar                             0.009028 ** 
## brandMahindra               < 0.0000000000000002 ***
## brandMaruti                 < 0.0000000000000002 ***
## brandMercedes-Benz                      0.003881 ** 
## brandNissan                 < 0.0000000000000002 ***
## brandRenault                < 0.0000000000000002 ***
## brandSkoda                  < 0.0000000000000002 ***
## brandTata                   < 0.0000000000000002 ***
## brandToyota                 < 0.0000000000000002 ***
## brandVolkswagen             < 0.0000000000000002 ***
## brandVolvo                              0.002610 ** 
## km_driven                   < 0.0000000000000002 ***
## fuelDiesel                  < 0.0000000000000002 ***
## fuelLPG                                 0.001762 ** 
## fuelPetrol                              0.027002 *  
## seller_typeIndividual                   0.000211 ***
## seller_typeTrustmark Dealer             0.568410    
## transmissionManual               0.0000004967817 ***
## ownerFourth & Above Owner        0.0000000000158 ***
## ownerSecond Owner           < 0.0000000000000002 ***
## ownerTest Drive Car                     0.062619 .  
## ownerThird Owner            < 0.0000000000000002 ***
## max_power_bhp               < 0.0000000000000002 ***
## engine_cc                   < 0.0000000000000002 ***
## seats                       < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2425 on 6909 degrees of freedom
## Multiple R-squared:  0.9091, Adjusted R-squared:  0.9087 
## F-statistic:  2230 on 31 and 6909 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model_sub, which = 1:4)

ncvTest(model_sub)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 321.3591, Df = 1, p = < 0.000000000000000222
ad.test(resid(model_sub))
## 
##  Anderson-Darling normality test
## 
## data:  resid(model_sub)
## A = 15.954, p-value < 0.00000000000000022
vif(model_sub)
##                   GVIF Df GVIF^(1/(2*Df))
## age           1.817830  1        1.348269
## brand         8.501065 16        1.069168
## km_driven     1.620422  1        1.272958
## fuel          1.796055  3        1.102520
## seller_type   1.539902  2        1.113969
## transmission  1.837740  1        1.355633
## owner         1.427641  4        1.045508
## max_power_bhp 3.874543  1        1.968386
## engine_cc     6.172238  1        2.484399
## seats         2.478445  1        1.574308

Leverage points

p <- ncol(train) - 1
n <- nrow(train)
nyc.hats <- hatvalues(model_sub)
sum(nyc.hats)
## [1] 32
nyc.std <- rstandard(model_sub)

plot(hatvalues(model_sub), rstandard(model_sub),
xlab='Leverage', ylab='Standardized Residuals')
abline(v = 3*(p+1)/n , lty = 2, lwd = 2, col = "red")
abline(h = c(-2, 2), lty = 2, lwd = 2, col = "blue")

model4

influenceIndexPlot(model_sub)

train$rownames <- rownames(train)
train_new <- train[!train$rownames %in% c(1811,8043),]


model3 <- lm(log(train_new$selling_price) ~ age + brand + km_driven + fuel + seller_type + 
    transmission + owner + max_power_bhp + engine_cc + seats ,data = train_new)
summary(model3)
## 
## Call:
## lm(formula = log(train_new$selling_price) ~ age + brand + km_driven + 
##     fuel + seller_type + transmission + owner + max_power_bhp + 
##     engine_cc + seats, data = train_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.33035 -0.14322  0.00968  0.15902  1.19459 
## 
## Coefficients:
##                                   Estimate     Std. Error  t value
## (Intercept)                 13.02233392528  0.05260083234  247.569
## age                         -0.11364691633  0.00102649202 -110.714
## brandChevrolet              -0.88514134033  0.03397876466  -26.050
## brandDatsun                 -0.86559390742  0.04451407491  -19.445
## brandFord                   -0.62240499283  0.03113570004  -19.990
## brandHonda                  -0.46807927854  0.03013314627  -15.534
## brandHyundai                -0.53138360875  0.02942702348  -18.058
## brandJaguar                 -0.10005489414  0.03817988155   -2.621
## brandMahindra               -0.67668038378  0.03170265998  -21.345
## brandMaruti                 -0.47831172601  0.03004972112  -15.917
## brandMercedes-Benz          -0.11995582609  0.04188307818   -2.864
## brandNissan                 -0.56515129771  0.03957374184  -14.281
## brandRenault                -0.60428557207  0.03392945927  -17.810
## brandSkoda                  -0.56621590180  0.03648137001  -15.521
## brandTata                   -0.95324828295  0.03037560519  -31.382
## brandToyota                 -0.31134213248  0.03166041134   -9.834
## brandVolkswagen             -0.59709057474  0.03340295261  -17.875
## brandVolvo                  -0.11911103542  0.03872947690   -3.075
## km_driven                   -0.00000079511  0.00000008001   -9.938
## fuelDiesel                   0.32209747170  0.03751971291    8.585
## fuelLPG                      0.17980083124  0.05665248347    3.174
## fuelPetrol                   0.08136316798  0.03743320358    2.174
## seller_typeIndividual       -0.03469555730  0.00972096195   -3.569
## seller_typeTrustmark Dealer -0.01227408108  0.02003644214   -0.613
## transmissionManual          -0.05974762545  0.01202659808   -4.968
## ownerFourth & Above Owner   -0.14123604346  0.02136898156   -6.609
## ownerSecond Owner           -0.08110903269  0.00749472146  -10.822
## ownerTest Drive Car          0.31705973078  0.17128824151    1.851
## ownerThird Owner            -0.11970570061  0.01284936147   -9.316
## max_power_bhp                0.00833691315  0.00017018996   48.986
## engine_cc                    0.00021037632  0.00001466109   14.349
## seats                        0.03839224170  0.00477306167    8.044
##                                         Pr(>|t|)    
## (Intercept)                 < 0.0000000000000002 ***
## age                         < 0.0000000000000002 ***
## brandChevrolet              < 0.0000000000000002 ***
## brandDatsun                 < 0.0000000000000002 ***
## brandFord                   < 0.0000000000000002 ***
## brandHonda                  < 0.0000000000000002 ***
## brandHyundai                < 0.0000000000000002 ***
## brandJaguar                             0.008796 ** 
## brandMahindra               < 0.0000000000000002 ***
## brandMaruti                 < 0.0000000000000002 ***
## brandMercedes-Benz                      0.004195 ** 
## brandNissan                 < 0.0000000000000002 ***
## brandRenault                < 0.0000000000000002 ***
## brandSkoda                  < 0.0000000000000002 ***
## brandTata                   < 0.0000000000000002 ***
## brandToyota                 < 0.0000000000000002 ***
## brandVolkswagen             < 0.0000000000000002 ***
## brandVolvo                              0.002110 ** 
## km_driven                   < 0.0000000000000002 ***
## fuelDiesel                  < 0.0000000000000002 ***
## fuelLPG                                 0.001511 ** 
## fuelPetrol                              0.029772 *  
## seller_typeIndividual                   0.000361 ***
## seller_typeTrustmark Dealer             0.540169    
## transmissionManual           0.00000069288170521 ***
## ownerFourth & Above Owner    0.00000000004146622 ***
## ownerSecond Owner           < 0.0000000000000002 ***
## ownerTest Drive Car                     0.064208 .  
## ownerThird Owner            < 0.0000000000000002 ***
## max_power_bhp               < 0.0000000000000002 ***
## engine_cc                   < 0.0000000000000002 ***
## seats                        0.00000000000000102 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2414 on 6907 degrees of freedom
## Multiple R-squared:   0.91,  Adjusted R-squared:  0.9096 
## F-statistic:  2252 on 31 and 6907 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model3,1:4)

nrow(train)  - nrow(train_new)
## [1] 2
influenceIndexPlot(model3)

ad.test(rstandard(model3))
## 
##  Anderson-Darling normality test
## 
## data:  rstandard(model3)
## A = 15.119, p-value < 0.00000000000000022
ncvTest(model3)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 284.682, Df = 1, p = < 0.000000000000000222

Weighted Least Square model4

wts <- 1/fitted(lm(abs(residuals(model3)) ~ train_new$age  + train_new$km_driven + train_new$fuel + train_new$brand + 
    train_new$seller_type + train_new$transmission + train_new$owner + train_new$engine_cc + train_new$max_power_bhp + train_new$seats ))^2

model4 <- lm(log(selling_price) ~age + brand + km_driven + fuel + seller_type + 
    transmission + owner + max_power_bhp + engine_cc + seats ,data = train_new, weights = wts)
summary(model4)
## 
## Call:
## lm(formula = log(selling_price) ~ age + brand + km_driven + fuel + 
##     seller_type + transmission + owner + max_power_bhp + engine_cc + 
##     seats, data = train_new, weights = wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9443 -0.7879  0.0879  0.8481  7.5929 
## 
## Coefficients:
##                                   Estimate     Std. Error  t value
## (Intercept)                 12.86929111766  0.04328225451  297.334
## age                         -0.11144513709  0.00109492909 -101.783
## brandChevrolet              -0.85693028700  0.03772147502  -22.717
## brandDatsun                 -0.80919189675  0.04302115385  -18.809
## brandFord                   -0.60942297344  0.03374022495  -18.062
## brandHonda                  -0.45467246751  0.03265215052  -13.925
## brandHyundai                -0.51095508730  0.03220131693  -15.868
## brandJaguar                 -0.12579404392  0.03573367751   -3.520
## brandMahindra               -0.67916694401  0.03415687225  -19.884
## brandMaruti                 -0.44511486583  0.03265517021  -13.631
## brandMercedes-Benz          -0.13282606737  0.04645151648   -2.859
## brandNissan                 -0.53892802742  0.03923312231  -13.737
## brandRenault                -0.56991191045  0.03525421407  -16.166
## brandSkoda                  -0.55213014465  0.03594755580  -15.359
## brandTata                   -0.91513532321  0.03427608577  -26.699
## brandToyota                 -0.35174919625  0.03404701190  -10.331
## brandVolkswagen             -0.60108843041  0.03603536658  -16.681
## brandVolvo                  -0.16121133740  0.02863378818   -5.630
## km_driven                   -0.00000082688  0.00000008133  -10.167
## fuelDiesel                   0.33156921750  0.02028136967   16.348
## fuelLPG                      0.19733134490  0.04881256462    4.043
## fuelPetrol                   0.08484892016  0.02028974721    4.182
## seller_typeIndividual       -0.01412861895  0.00651577913   -2.168
## seller_typeTrustmark Dealer  0.00994122314  0.01397883190    0.711
## transmissionManual          -0.05431771848  0.01004404071   -5.408
## ownerFourth & Above Owner   -0.14475755425  0.02578514446   -5.614
## ownerSecond Owner           -0.07051815776  0.00692428385  -10.184
## ownerTest Drive Car          0.30674188674  0.06071262280    5.052
## ownerThird Owner            -0.11609232801  0.01413127795   -8.215
## max_power_bhp                0.00912165092  0.00014554820   62.671
## engine_cc                    0.00024156756  0.00001330299   18.159
## seats                        0.03382609170  0.00453509744    7.459
##                                         Pr(>|t|)    
## (Intercept)                 < 0.0000000000000002 ***
## age                         < 0.0000000000000002 ***
## brandChevrolet              < 0.0000000000000002 ***
## brandDatsun                 < 0.0000000000000002 ***
## brandFord                   < 0.0000000000000002 ***
## brandHonda                  < 0.0000000000000002 ***
## brandHyundai                < 0.0000000000000002 ***
## brandJaguar                             0.000434 ***
## brandMahindra               < 0.0000000000000002 ***
## brandMaruti                 < 0.0000000000000002 ***
## brandMercedes-Benz                      0.004256 ** 
## brandNissan                 < 0.0000000000000002 ***
## brandRenault                < 0.0000000000000002 ***
## brandSkoda                  < 0.0000000000000002 ***
## brandTata                   < 0.0000000000000002 ***
## brandToyota                 < 0.0000000000000002 ***
## brandVolkswagen             < 0.0000000000000002 ***
## brandVolvo                  0.000000018716836051 ***
## km_driven                   < 0.0000000000000002 ***
## fuelDiesel                  < 0.0000000000000002 ***
## fuelLPG                     0.000053428475903489 ***
## fuelPetrol                  0.000029270716099603 ***
## seller_typeIndividual                   0.030165 *  
## seller_typeTrustmark Dealer             0.477007    
## transmissionManual          0.000000065884386340 ***
## ownerFourth & Above Owner   0.000000020538966060 ***
## ownerSecond Owner           < 0.0000000000000002 ***
## ownerTest Drive Car         0.000000447590480613 ***
## ownerThird Owner            0.000000000000000251 ***
## max_power_bhp               < 0.0000000000000002 ***
## engine_cc                   < 0.0000000000000002 ***
## seats                       0.000000000000098043 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.269 on 6907 degrees of freedom
## Multiple R-squared:  0.9678, Adjusted R-squared:  0.9676 
## F-statistic:  6695 on 31 and 6907 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model4,1:4)

influenceIndexPlot(model4)

library(nortest)
ad.test(rstandard(model4))
## 
##  Anderson-Darling normality test
## 
## data:  rstandard(model4)
## A = 11.783, p-value < 0.00000000000000022
ncvTest(model4)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 3.826009, Df = 1, p = 0.050463
hist(resid(model4))

vif(model4)
##                    GVIF Df GVIF^(1/(2*Df))
## age            2.423701  1        1.556824
## brand         39.938858 16        1.122131
## km_driven      2.885549  1        1.698690
## fuel           2.372212  3        1.154850
## seller_type    3.184873  2        1.335897
## transmission   5.701131  1        2.387704
## owner          1.668984  4        1.066121
## max_power_bhp  8.033941  1        2.834421
## engine_cc      8.304141  1        2.881691
## seats          2.367013  1        1.538510

model5

wts <- 1/fitted(lm(abs(residuals(model3)) ~ train_new$age  + train_new$km_driven + train_new$fuel + train_new$brand + 
    train_new$seller_type + train_new$transmission + train_new$owner  + train_new$max_power_bhp + train_new$seats ))^2

model5 <- lm(log(selling_price) ~age + brand + km_driven + fuel + seller_type + 
    transmission + owner + max_power_bhp  + seats ,data = train_new, weights = wts)
summary(model4)
## 
## Call:
## lm(formula = log(selling_price) ~ age + brand + km_driven + fuel + 
##     seller_type + transmission + owner + max_power_bhp + engine_cc + 
##     seats, data = train_new, weights = wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9443 -0.7879  0.0879  0.8481  7.5929 
## 
## Coefficients:
##                                   Estimate     Std. Error  t value
## (Intercept)                 12.86929111766  0.04328225451  297.334
## age                         -0.11144513709  0.00109492909 -101.783
## brandChevrolet              -0.85693028700  0.03772147502  -22.717
## brandDatsun                 -0.80919189675  0.04302115385  -18.809
## brandFord                   -0.60942297344  0.03374022495  -18.062
## brandHonda                  -0.45467246751  0.03265215052  -13.925
## brandHyundai                -0.51095508730  0.03220131693  -15.868
## brandJaguar                 -0.12579404392  0.03573367751   -3.520
## brandMahindra               -0.67916694401  0.03415687225  -19.884
## brandMaruti                 -0.44511486583  0.03265517021  -13.631
## brandMercedes-Benz          -0.13282606737  0.04645151648   -2.859
## brandNissan                 -0.53892802742  0.03923312231  -13.737
## brandRenault                -0.56991191045  0.03525421407  -16.166
## brandSkoda                  -0.55213014465  0.03594755580  -15.359
## brandTata                   -0.91513532321  0.03427608577  -26.699
## brandToyota                 -0.35174919625  0.03404701190  -10.331
## brandVolkswagen             -0.60108843041  0.03603536658  -16.681
## brandVolvo                  -0.16121133740  0.02863378818   -5.630
## km_driven                   -0.00000082688  0.00000008133  -10.167
## fuelDiesel                   0.33156921750  0.02028136967   16.348
## fuelLPG                      0.19733134490  0.04881256462    4.043
## fuelPetrol                   0.08484892016  0.02028974721    4.182
## seller_typeIndividual       -0.01412861895  0.00651577913   -2.168
## seller_typeTrustmark Dealer  0.00994122314  0.01397883190    0.711
## transmissionManual          -0.05431771848  0.01004404071   -5.408
## ownerFourth & Above Owner   -0.14475755425  0.02578514446   -5.614
## ownerSecond Owner           -0.07051815776  0.00692428385  -10.184
## ownerTest Drive Car          0.30674188674  0.06071262280    5.052
## ownerThird Owner            -0.11609232801  0.01413127795   -8.215
## max_power_bhp                0.00912165092  0.00014554820   62.671
## engine_cc                    0.00024156756  0.00001330299   18.159
## seats                        0.03382609170  0.00453509744    7.459
##                                         Pr(>|t|)    
## (Intercept)                 < 0.0000000000000002 ***
## age                         < 0.0000000000000002 ***
## brandChevrolet              < 0.0000000000000002 ***
## brandDatsun                 < 0.0000000000000002 ***
## brandFord                   < 0.0000000000000002 ***
## brandHonda                  < 0.0000000000000002 ***
## brandHyundai                < 0.0000000000000002 ***
## brandJaguar                             0.000434 ***
## brandMahindra               < 0.0000000000000002 ***
## brandMaruti                 < 0.0000000000000002 ***
## brandMercedes-Benz                      0.004256 ** 
## brandNissan                 < 0.0000000000000002 ***
## brandRenault                < 0.0000000000000002 ***
## brandSkoda                  < 0.0000000000000002 ***
## brandTata                   < 0.0000000000000002 ***
## brandToyota                 < 0.0000000000000002 ***
## brandVolkswagen             < 0.0000000000000002 ***
## brandVolvo                  0.000000018716836051 ***
## km_driven                   < 0.0000000000000002 ***
## fuelDiesel                  < 0.0000000000000002 ***
## fuelLPG                     0.000053428475903489 ***
## fuelPetrol                  0.000029270716099603 ***
## seller_typeIndividual                   0.030165 *  
## seller_typeTrustmark Dealer             0.477007    
## transmissionManual          0.000000065884386340 ***
## ownerFourth & Above Owner   0.000000020538966060 ***
## ownerSecond Owner           < 0.0000000000000002 ***
## ownerTest Drive Car         0.000000447590480613 ***
## ownerThird Owner            0.000000000000000251 ***
## max_power_bhp               < 0.0000000000000002 ***
## engine_cc                   < 0.0000000000000002 ***
## seats                       0.000000000000098043 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.269 on 6907 degrees of freedom
## Multiple R-squared:  0.9678, Adjusted R-squared:  0.9676 
## F-statistic:  6695 on 31 and 6907 DF,  p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model4,1:4)

influenceIndexPlot(model5)

ad.test(rstandard(model5))
## 
##  Anderson-Darling normality test
## 
## data:  rstandard(model5)
## A = 8.4198, p-value < 0.00000000000000022
ncvTest(model5)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 3.135075, Df = 1, p = 0.076625
hist(resid(model5))
vif(model5)
##                    GVIF Df GVIF^(1/(2*Df))
## age            2.506426  1        1.583170
## brand         36.812509 16        1.119276
## km_driven      3.288264  1        1.813357
## fuel           2.268820  3        1.146305
## seller_type    3.855163  2        1.401234
## transmission   6.989814  1        2.643826
## owner          1.730053  4        1.070921
## max_power_bhp  6.703728  1        2.589156
## seats          2.001459  1        1.414729

Model Validation

train_new['predict_selling_price'] <- exp(predict(model5,train_new))
train_new['percent_diff'] <- abs((train_new['selling_price'] - train_new['predict_selling_price'] )/train_new['selling_price']) * 100

ggplot(train_new, aes(x = selling_price, y = predict_selling_price)) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson") + scale_y_continuous(labels = scales::comma) + scale_x_continuous(labels = scales::comma) + ggtitle("Train Actual Predicted Correlation")

test['predict_selling_price'] <- exp(predict(model5,test))
test['percent_diff'] <- abs((test['selling_price'] - test['predict_selling_price'] )/test['selling_price']) * 100

ggplot(test, aes(x = selling_price, y = predict_selling_price)) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson") + scale_y_continuous(labels = scales::comma) + scale_x_continuous(labels = scales::comma) + ggtitle("Test Actual Predicted Correlation")

Variance Importance Plot

hist((train$km_driven))

boxplot(train$km_driven)

summary(train$km_driven)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1   35000   60000   69294   96443 1500000
library(caret)
## Loading required package: lattice
var_imp <- varImp(model4)

ggplot(var_imp, aes(x= Overall, y=reorder(rownames(var_imp),Overall), fill=rownames(var_imp) )) +
  geom_bar(stat="identity")+theme_minimal() + guides(fill=FALSE) + labs(title=paste0("Variable Importance Plot"),y= "features") + xlab("score")